In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by later cells;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')
In [2]:
# Load the shipping dataset and preview the first five rows, transposed so that
# every column of the file is shown as one row of the preview.
df = pd.read_csv("EcomShipping.csv")
df.head().T
Out[2]:
0 1 2 3 4
ID 1 2 3 4 5
Warehouse_block D F A B C
Mode_of_Shipment Flight Flight Flight Flight Flight
Customer_care_calls 4 4 2 3 2
Customer_rating 2 5 2 3 2
Cost_of_the_Product 177 216 183 176 184
Prior_purchases 3 2 4 4 3
Product_importance low low low medium medium
Gender F M M M F
Discount_offered 44 59 48 10 46
Weight_in_gms 1233 3088 3374 1177 2484
Reached.on.Time_Y.N 1 1 1 1 1
In [3]:
# Store the delivery flag as text so it is reported with the categorical
# columns instead of being summarised as a number.
df = df.astype({'Reached.on.Time_Y.N': str})
In [4]:
# Categorical data: count / unique / top / freq for the object-typed columns
df.describe(include=['object'])
Out[4]:
Warehouse_block Mode_of_Shipment Product_importance Gender Reached.on.Time_Y.N
count 10999 10999 10999 10999 10999
unique 5 3 3 2 2
top F Ship low F 1
freq 3666 7462 5297 5545 6563
In [5]:
# List every non-numeric column together with its distinct values (sorted).
non_numeric = df.select_dtypes(exclude='number')
for name, values in non_numeric.items():
    print(name, sorted(values.unique()))
Warehouse_block ['A', 'B', 'C', 'D', 'F']
Mode_of_Shipment ['Flight', 'Road', 'Ship']
Product_importance ['high', 'low', 'medium']
Gender ['F', 'M']
Reached.on.Time_Y.N ['0', '1']
In [6]:
# Numeric data: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the frame.
df.describe()
Out[6]:
ID Customer_care_calls Customer_rating Cost_of_the_Product Prior_purchases Discount_offered Weight_in_gms
count 10999.00000 10999.000000 10999.000000 10999.000000 10999.000000 10999.000000 10999.000000
mean 5500.00000 4.054459 2.990545 210.196836 3.567597 13.373216 3634.016729
std 3175.28214 1.141490 1.413603 48.063272 1.522860 16.205527 1635.377251
min 1.00000 2.000000 1.000000 96.000000 2.000000 1.000000 1001.000000
25% 2750.50000 3.000000 2.000000 169.000000 3.000000 4.000000 1839.500000
50% 5500.00000 4.000000 3.000000 214.000000 3.000000 7.000000 4149.000000
75% 8249.50000 5.000000 4.000000 251.000000 4.000000 10.000000 5050.000000
max 10999.00000 7.000000 5.000000 310.000000 10.000000 65.000000 7846.000000

Data cleaning (check)¶

In [7]:
# missingno bar chart, used here to check each column for missing values.
msno.bar(df, color="orange")
plt.title("Checking for Missing Values\n", fontsize=40)
plt.show()
No description has been provided for this image

Heatmap of the data to check the correlation between the features and the target column¶

In [8]:
# Correlation heatmap of the numeric features together with the target.
# 'Reached.on.Time_Y.N' was cast to str in an earlier cell, so selecting the
# numeric dtypes alone would silently leave the target out of the matrix.
# Re-attach it as an integer column for this plot only; df itself is not modified.
target_col = 'Reached.on.Time_Y.N'
corr_matrix = (
    df.select_dtypes(include='number')
      .assign(**{target_col: df[target_col].astype(int)})
      .corr()
)

plt.figure(figsize = (18, 7))
sns.heatmap(corr_matrix, annot=True, fmt='0.2f',
            annot_kws={'size': 15}, linewidth=2, linecolor='orange')
plt.show()
No description has been provided for this image

Checking value counts of columns¶

In [28]:
# Columns whose value distribution is inspected (categorical or low-cardinality).
cols = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating',
        'Prior_purchases', 'Product_importance', 'Gender', 'Reached.on.Time_Y.N']

# Derive the grid from the number of columns instead of hard-coding 4 x 2.
# The previous guard (`i <= 8`) would still have requested a 9th cell in an
# 8-cell grid, which raises; sizing the grid from `cols` removes the guard.
grid_cols = 2
grid_rows = -(-len(cols) // grid_cols)  # ceiling division

# squeeze=False always yields a 2-D array of axes, so ravel() is safe for any size.
counts_fig, axes = plt.subplots(grid_rows, grid_cols,
                                figsize = (16, 5 * grid_rows), squeeze = False)
axes = axes.ravel()

# One countplot per column, each drawn on its own explicit axes.
for ax, col in zip(axes, cols):
    sns.countplot(x = col, data = df, ax = ax, palette='rocket')
    ax.set_title(f"\n{col} Value Counts\n", fontsize = 20)

# Hide any leftover axes when len(cols) does not fill the grid exactly.
for unused_ax in axes[len(cols):]:
    unused_ax.set_visible(False)

counts_fig.tight_layout()
plt.show()
No description has been provided for this image

Exploring relation of categorical columns¶

In [10]:
# Keep only the object-typed (categorical) columns; later cells read from this frame.
object_columns = df.select_dtypes(include = ['object'])
# Fixed seed so the previewed rows are the same on every Restart & Run All.
object_columns.sample(5, random_state = 42)
Out[10]:
Warehouse_block Mode_of_Shipment Product_importance Gender Reached.on.Time_Y.N
181 F Ship low M 1
9942 D Ship low F 1
9407 F Ship medium M 0
141 B Flight low M 1
1722 D Ship medium F 1

Warehouse column and categories proportion¶

In [11]:
# Number of shipments per warehouse block, as a two-column frame for plotly.
warehouse = (
    object_columns['Warehouse_block']
    .value_counts()
    .rename_axis('warehouse')
    .reset_index(name='value_counts')
)

# Donut chart showing each block's share of all shipments.
fig = px.pie(
    warehouse,
    names='warehouse',
    values='value_counts',
    hole=0.5,
    width=650,
    height=400,
    color_discrete_sequence=px.colors.sequential.matter_r,
)
fig.update_traces(textinfo='percent+label')

On-time vs. late shipment counts per warehouse block¶

In [12]:
# 1 : NOT on time and 0: on time
# Shipment counts per warehouse block, split by the delivery flag.
plt.figure(figsize = (17, 6))
# hue is given as a column name resolved through `data`, the same way the
# other countplots in this notebook do it, rather than as a separate Series.
sns.countplot(data = df, x = 'Warehouse_block', hue = 'Reached.on.Time_Y.N',
              palette = 'rocket')
plt.show()
No description has been provided for this image

Gender¶

In [13]:
# Number of shipments per customer gender, as a two-column frame for plotly.
gender = (
    object_columns['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='value_counts')
)

# Donut chart of the gender split.
fig = px.pie(
    gender,
    names='Gender',
    values='value_counts',
    hole=0.4,
    width=650,
    height=400,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(textinfo='percent+label')
In [14]:
# 1 : NOT on time and 0: on time
# Shipment counts per gender, split by the delivery flag.
ax = plt.figure(figsize=(17, 6)).add_subplot(111)
sns.countplot(data=df, x='Gender', hue='Reached.on.Time_Y.N', palette='rocket', ax=ax)
plt.show()
No description has been provided for this image

Shipment method¶

In [15]:
# Number of shipments per shipping mode, as a two-column frame for plotly.
mode = (
    object_columns['Mode_of_Shipment']
    .value_counts()
    .rename_axis('Mode_of_Shipment')
    .reset_index(name='value_counts')
)

# Donut chart of each shipping mode's share.
fig = px.pie(
    mode,
    names='Mode_of_Shipment',
    values='value_counts',
    hole=0.5,
    width=650,
    height=400,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(textinfo='percent+label')

Reached on time vs. shipping method¶

In [16]:
# 1 : NOT on time and 0: on time
# Shipment counts per shipping mode, split by the delivery flag.
ax = plt.figure(figsize=(17, 6)).add_subplot(111)
sns.countplot(data=df, x='Mode_of_Shipment', hue='Reached.on.Time_Y.N',
              palette='rocket', ax=ax)
plt.show()
No description has been provided for this image

Product importance¶

In [17]:
# Number of shipments per product-importance level, as a two-column frame for plotly.
product_imp = (
    object_columns['Product_importance']
    .value_counts()
    .rename_axis('Product_importance')
    .reset_index(name='value_counts')
)

# Donut chart of each importance level's share.
fig = px.pie(
    product_imp,
    names='Product_importance',
    values='value_counts',
    hole=0.5,
    width=650,
    height=400,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(textinfo='percent+label')

Product importance Shipment on time¶

In [18]:
# 1 : NOT on time and 0: on time
# Shipment counts per product-importance level, split by the delivery flag.
ax = plt.figure(figsize=(17, 6)).add_subplot(111)
sns.countplot(data=df, x='Product_importance', hue='Reached.on.Time_Y.N',
              palette='rocket', ax=ax)
plt.show()
No description has been provided for this image

Relation of numeric columns to on-time delivery¶

In [19]:
# Keep only the int64 columns; the numeric sections below read from this frame.
integer_columns = df.select_dtypes(include='int64')
integer_columns.head()
Out[19]:
ID Customer_care_calls Customer_rating Cost_of_the_Product Prior_purchases Discount_offered Weight_in_gms
0 1 4 2 177 3 44 1233
1 2 4 5 216 2 59 3088
2 3 2 2 183 4 48 3374
3 4 3 3 176 4 10 1177
4 5 2 2 184 3 46 2484

Customer care calls¶

In [20]:
# How many shipments fall under each number of customer-care calls.
customer_care = (
    integer_columns['Customer_care_calls']
    .value_counts()
    .rename_axis('Customer_care_calls')
    .reset_index(name='value_counts')
)

# Donut chart of the share of each call count.
fig = px.pie(
    customer_care,
    names='Customer_care_calls',
    values='value_counts',
    hole=0.4,
    width=650,
    height=400,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(textinfo='percent+label')

Customer call vs Shipment on time¶

In [21]:
# 1 : NOT on time and 0: on time
# Shipment counts per number of customer-care calls, split by the delivery flag.
ax = plt.figure(figsize=(17, 6)).add_subplot(111)
sns.countplot(x='Customer_care_calls', hue='Reached.on.Time_Y.N', data=df,
              palette='rocket', ax=ax)
plt.show()
No description has been provided for this image

Customers' rating¶

In [22]:
# How many shipments received each customer rating.
customer_ratings = (
    integer_columns['Customer_rating']
    .value_counts()
    .rename_axis('Customer_rating')
    .reset_index(name='value_counts')
)

# Donut chart of the share of each rating value.
fig = px.pie(
    customer_ratings,
    names='Customer_rating',
    values='value_counts',
    hole=0.4,
    width=650,
    height=400,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(textinfo='percent+label')

Customers rating vs Shipment on time¶

In [23]:
# 1 : NOT on time and 0: on time
# Shipment counts per customer rating, split by the delivery flag.
ax = plt.figure(figsize=(17, 6)).add_subplot(111)
sns.countplot(data=df, x='Customer_rating', hue='Reached.on.Time_Y.N',
              palette='rocket', ax=ax)
plt.show()
No description has been provided for this image

Prior purchase¶

In [24]:
# How many shipments belong to each prior-purchase count.
prior_purchases = (
    integer_columns['Prior_purchases']
    .value_counts()
    .rename_axis('Prior_purchases')
    .reset_index(name='value_counts')
)

# Donut chart of the share of each prior-purchase count.
fig = px.pie(
    prior_purchases,
    names='Prior_purchases',
    values='value_counts',
    hole=0.4,
    width=650,
    height=400,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(textinfo='percent+label')

Prior purchase vs shipment on time¶

In [25]:
# 1 : NOT on time and 0: on time
# Shipment counts per prior-purchase count, split by the delivery flag.
ax = plt.figure(figsize=(17, 6)).add_subplot(111)
sns.countplot(data=df, x='Prior_purchases', hue='Reached.on.Time_Y.N',
              palette='rocket', ax=ax)
plt.show()
No description has been provided for this image

Reached on time vs. not on time¶

In [31]:
# 1 : NOT on time and 0: on time
# Number of shipments for each value of the delivery flag.
reached_on_time_y_n = (
    df['Reached.on.Time_Y.N']
    .value_counts()
    .rename_axis('Reached.on.Time_Y.N')
    .reset_index(name='value_counts')
)

# Donut chart of the overall split between the two flag values.
fig = px.pie(
    reached_on_time_y_n,
    names='Reached.on.Time_Y.N',
    values='value_counts',
    hole=0.4,
    width=650,
    height=400,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(textinfo='percent+label')

Cost of the product¶

In [32]:
# Distribution of product cost: 100-bin histogram with a KDE curve overlaid.
plt.figure(figsize=(15, 7))
ax = sns.histplot(data=df, x='Cost_of_the_Product', bins=100, kde=True, color='orange')
plt.show()
No description has been provided for this image

Relation between the cost of the product and on-time delivery¶

In [33]:
# 1 : NOT on time and 0: on time
# Box plot of product cost, grouped and coloured by the delivery flag.
px.box(
    df,
    x='Reached.on.Time_Y.N',
    y='Cost_of_the_Product',
    color='Reached.on.Time_Y.N',
)
In [ ]: